#include "opencl_source_map.hpp" 
namespace MNN { 
#ifndef MNN_OPENCL_BUFFER_CLOSED
const char* gemm_conv1x1_buf = 
"#ifdef MNN_SUPPORT_FP16\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"#define GLOBAL_SIZE_DIM2 "" __private int global_size_dim0,__private int global_size_dim1,\n"
"#define UNIFORM_BOUNDRY_CHECK(index0, index1) "" if(index0 >= global_size_dim0 || index1 >= global_size_dim1) { "" return; "" }\n"
"#define UCHAR4_TO_CHAR8(a, c) "" a.s0=(c.s0 >> 4)-8; a.s1=(c.s0 & 15)-8; a.s2=(c.s1 >> 4)-8; a.s3=(c.s1 & 15)-8; a.s4=(c.s2 >> 4)-8; a.s5=(c.s2 & 15)-8; a.s6=(c.s3 >> 4)-8; a.s7=(c.s3 & 15)-8;\n"
"__constant sampler_t SAMPLER=CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"
"__kernel void inverse_quant_weight(GLOBAL_SIZE_DIM2\n"
" #ifdef USE_IMAGE\n"
" __read_only image2d_t weight,\n"
" #else\n"
" #if QUANT_BIT == 8\n"
" __global const char *weight,\n"
" #else\n"
" __global const uchar *weight,\n"
" #endif\n"
" #endif\n"
" __global const FLOAT *dequantScaleOffset,\n"
" __global FLOAT* output,\n"
" __private const int inputChannel,\n"
" __private const int inputChannel4Align,\n"
" __private const int outputChannelAlign,\n"
" __private const int outputChannel4Align,\n"
" __private const int blockDim,\n"
" __private const float coef){\n"
" const int x=get_global_id(0); //ic\n"
" const int y=get_global_id(1); //oc\n"
" UNIFORM_BOUNDRY_CHECK(x,y);\n"
" \n"
"#if QUANT_BIT == 4\n"
" const int ic=x << 2;\n"
" const int oc=y << 3;\n"
" const int output_offset=ic*outputChannelAlign+oc;\n"
" #ifdef ASYMMETRIC\n"
" COMPUTE_FLOAT8 scale,offset;\n"
" {\n"
" COMPUTE_FLOAT16 ScaleOffset=CONVERT_COMPUTE_FLOAT16(convert_float16(vload16(0,dequantScaleOffset+((ic/blockDim)*outputChannel4Align+oc)*2))/coef);\n"
" scale=ScaleOffset.s02468ace;\n"
" offset=ScaleOffset.s13579bdf;\n"
" }\n"
" #else\n"
" COMPUTE_FLOAT8 scale=CONVERT_COMPUTE_FLOAT8(convert_float8(vload8(0,dequantScaleOffset+(ic/blockDim)*outputChannel4Align+oc))/coef);\n"
" #endif\n"
" COMPUTE_FLOAT8 weights0,weights1,weights2,weights3;\n"
" {\n"
" #ifdef USE_IMAGE\n"
" uchar16 charWeightsInt40=as_uchar16(read_imagei(weight,SAMPLER,(int2)(x,y)));\n"
" #else\n"
" uchar16 charWeightsInt40=vload16(x,weight+y*inputChannel4Align*4);\n"
" #endif\n"
" char8 charWeights0;\n"
" #ifdef ASYMMETRIC\n"
" UCHAR4_TO_CHAR8(charWeights0,charWeightsInt40.s0123);\n"
" weights0=CONVERT_COMPUTE_FLOAT8(charWeights0)*scale+offset;\n"
" \n"
" UCHAR4_TO_CHAR8(charWeights0,charWeightsInt40.s4567);\n"
" weights1=ic+1 >= inputChannel ? 0 : CONVERT_COMPUTE_FLOAT8(charWeights0)*scale+offset;\n"
" \n"
" UCHAR4_TO_CHAR8(charWeights0,charWeightsInt40.s89ab);\n"
" weights2=ic+2 >= inputChannel ? 0 : CONVERT_COMPUTE_FLOAT8(charWeights0)*scale+offset;\n"
" \n"
" UCHAR4_TO_CHAR8(charWeights0,charWeightsInt40.scdef);\n"
" weights3=ic+3 >= inputChannel ? 0 : CONVERT_COMPUTE_FLOAT8(charWeights0)*scale+offset;\n"
" #else\n"
" UCHAR4_TO_CHAR8(charWeights0,charWeightsInt40.s0123);\n"
" weights0=CONVERT_COMPUTE_FLOAT8(charWeights0)*scale;\n"
" \n"
" UCHAR4_TO_CHAR8(charWeights0,charWeightsInt40.s4567);\n"
" weights1=ic+1 >= inputChannel ? 0 : CONVERT_COMPUTE_FLOAT8(charWeights0)*scale;\n"
" \n"
" UCHAR4_TO_CHAR8(charWeights0,charWeightsInt40.s89ab);\n"
" weights2=ic+2 >= inputChannel ? 0 : CONVERT_COMPUTE_FLOAT8(charWeights0)*scale;\n"
" \n"
" UCHAR4_TO_CHAR8(charWeights0,charWeightsInt40.scdef);\n"
" weights3=ic+3 >= inputChannel ? 0 : CONVERT_COMPUTE_FLOAT8(charWeights0)*scale;\n"
" #endif\n"
" }\n"
" vstore8(CONVERT_FLOAT8(weights0),0,output+output_offset);\n"
" vstore8(CONVERT_FLOAT8(weights1),0,output+output_offset+outputChannelAlign);\n"
" vstore8(CONVERT_FLOAT8(weights2),0,output+output_offset+2*outputChannelAlign);\n"
" vstore8(CONVERT_FLOAT8(weights3),0,output+output_offset+3*outputChannelAlign);\n"
"#elif QUANT_BIT == 8\n"
" const int ic=x << 1;\n"
" const int oc=y << 3;\n"
" const int output_offset=ic*outputChannelAlign+oc;\n"
" \n"
" #ifdef ASYMMETRIC\n"
" COMPUTE_FLOAT8 scale,offset;\n"
" {\n"
" COMPUTE_FLOAT16 ScaleOffset=CONVERT_COMPUTE_FLOAT16(convert_float16(vload16(0,dequantScaleOffset+((ic/blockDim)*outputChannel4Align+oc)*2))/coef);\n"
" scale=ScaleOffset.s02468ace;\n"
" offset=ScaleOffset.s13579bdf;\n"
" }\n"
" #else\n"
" COMPUTE_FLOAT8 scale=CONVERT_COMPUTE_FLOAT8(convert_float8(vload8(0,dequantScaleOffset+(ic/blockDim)*outputChannel4Align+oc))/coef);\n"
" #endif\n"
" COMPUTE_FLOAT8 weights0,weights1;\n"
" {\n"
" #ifdef USE_IMAGE\n"
" COMPUTE_FLOAT16 wei=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(x,y))));\n"
" #else\n"
" COMPUTE_FLOAT16 wei=CONVERT_COMPUTE_FLOAT16(vload16(x,weight+y*inputChannel4Align*8));\n"
" #endif\n"
" #ifdef ASYMMETRIC\n"
" weights0=wei.s01234567*scale+offset;\n"
" weights1=ic+1 >= inputChannel ? 0 : wei.s89abcdef*scale+offset;\n"
" #else\n"
" weights0=wei.s01234567*scale;\n"
" weights1=ic+1 >= inputChannel ? 0 : wei.s89abcdef*scale;\n"
" #endif\n"
" }\n"
" vstore8(CONVERT_FLOAT8(weights0),0,output+output_offset);\n"
" vstore8(CONVERT_FLOAT8(weights1),0,output+output_offset+outputChannelAlign);\n"
" #endif\n"
"}\n"
"__kernel void gemm_c4nhw4_to_nhwc(GLOBAL_SIZE_DIM2\n"
"__global const FLOAT* input,\n"
"__global FLOAT* output,\n"
"__private const int bhw,\n"
"__private const int channel,\n"
"__private const int channelAlign\n"
"){\n"
" const int x=get_global_id(0); //b/4\n"
" const int y=get_global_id(1); //c/4\n"
" UNIFORM_BOUNDRY_CHECK(x,y);\n"
" const int out_b_idx=x << 2;\n"
" const int out_c_idx=y << 2;\n"
" const int bhw4=bhw << 2;\n"
" const int input_offset=y*bhw4+out_b_idx*4;\n"
" FLOAT4 in0,in1,in2,in3;\n"
" if(out_c_idx+3<channel && out_b_idx+3<bhw){\n"
" in0=vload4(0,input+input_offset);\n"
" in1=vload4(0,input+input_offset+4);\n"
" in2=vload4(0,input+input_offset+8);\n"
" in3=vload4(0,input+input_offset+12);\n"
" } else{\n"
" if(out_c_idx+3<channel){\n"
" in0=vload4(0,input+input_offset);\n"
" in1=out_b_idx+1<bhw ? vload4(0,input+input_offset+4) : 0;\n"
" in2=out_b_idx+2<bhw ? vload4(0,input+input_offset+8) : 0;\n"
" in3=out_b_idx+3<bhw ? vload4(0,input+input_offset+12) : 0;\n"
" } else if(out_c_idx+1 == channel){\n"
" in0=(FLOAT4)(input[input_offset],0,0,0);\n"
" in1=out_b_idx+1<bhw ? (FLOAT4)(input[input_offset+4],0,0,0) : 0;\n"
" in2=out_b_idx+2<bhw ? (FLOAT4)(input[input_offset+8],0,0,0) : 0;\n"
" in3=out_b_idx+3<bhw ? (FLOAT4)(input[input_offset+12],0,0,0) : 0;\n"
" } else if(out_c_idx+2 == channel){\n"
" in0=(FLOAT4)(input[input_offset],input[input_offset+1],0,0);\n"
" in1=out_b_idx+1<bhw ? (FLOAT4)(input[input_offset+4],input[input_offset+5],0,0) : 0;\n"
" in2=out_b_idx+2<bhw ? (FLOAT4)(input[input_offset+8],input[input_offset+9],0,0) : 0;\n"
" in3=out_b_idx+3<bhw ? (FLOAT4)(input[input_offset+12],input[input_offset+13],0,0) : 0;\n"
" } else if(out_c_idx+3 == channel){\n"
" in0=(FLOAT4)(input[input_offset],input[input_offset+1],input[input_offset+2],0);\n"
" in1=out_b_idx+1<bhw ? (FLOAT4)(input[input_offset+4],input[input_offset+5],input[input_offset+6],0) : 0;\n"
" in2=out_b_idx+2<bhw ? (FLOAT4)(input[input_offset+8],input[input_offset+9],input[input_offset+10],0) : 0;\n"
" in3=out_b_idx+3<bhw ? (FLOAT4)(input[input_offset+12],input[input_offset+13],input[input_offset+14],0) : 0;\n"
" }\n"
" }\n"
" int out_offset=out_b_idx*channelAlign+out_c_idx;\n"
" vstore4(in0,0,output+out_offset);\n"
" vstore4(in1,0,output+out_offset+channelAlign);\n"
" vstore4(in2,0,output+out_offset+channelAlign+channelAlign);\n"
" vstore4(in3,0,output+out_offset+channelAlign+channelAlign+channelAlign);\n"
"}\n"
"__kernel void gemm_nhwc_to_c4nhw4(GLOBAL_SIZE_DIM2\n"
"__global const FLOAT* input,\n"
"__global FLOAT* output,\n"
"__private const int bhw,\n"
"__private const int channelAlign\n"
"){\n"
" const int x=get_global_id(0); //b/4\n"
" const int y=get_global_id(1); //c/4\n"
" UNIFORM_BOUNDRY_CHECK(x,y);\n"
" const int out_b_idx=x << 2;\n"
" const int out_c_idx=y << 2;\n"
" const int bhw4=bhw << 2;\n"
" const int input_offset=out_b_idx*channelAlign+out_c_idx;\n"
" FLOAT4 in0=vload4(0,input+input_offset);\n"
" FLOAT4 in1=vload4(0,input+input_offset+channelAlign);\n"
" FLOAT4 in2=vload4(0,input+input_offset+channelAlign+channelAlign);\n"
" FLOAT4 in3=vload4(0,input+input_offset+channelAlign+channelAlign+channelAlign);\n"
" int out_offset=y*bhw4+out_b_idx*4;\n"
" vstore4(in0,0,output+out_offset);\n"
" if(out_b_idx+1 >= bhw) return;\n"
" vstore4(in1,0,output+out_offset+4);\n"
" if(out_b_idx+2 >= bhw) return;\n"
" vstore4(in2,0,output+out_offset+8);\n"
" if(out_b_idx+3 >= bhw) return;\n"
" vstore4(in3,0,output+out_offset+12);\n"
"}\n"
"#define UCHAR4_TO_FLOAT8(b, scale, offset) "" wei.s0 = (COMPUTE_FLOAT)((b.s0 >> 4) - 8); "" wei.s1 = (COMPUTE_FLOAT)((b.s0 & 15) - 8); "" wei.s2 = (COMPUTE_FLOAT)((b.s1 >> 4) - 8); "" wei.s3 = (COMPUTE_FLOAT)((b.s1 & 15) - 8); "" wei.s4 = (COMPUTE_FLOAT)((b.s2 >> 4) - 8); "" wei.s5 = (COMPUTE_FLOAT)((b.s2 & 15) - 8); "" wei.s6 = (COMPUTE_FLOAT)((b.s3 >> 4) - 8); "" wei.s7 = (COMPUTE_FLOAT)((b.s3 & 15) - 8); "" wei=wei*scale+offset;\n"
"__kernel void gemm_b4_c8_int4_buf(GLOBAL_SIZE_DIM2\n"
" __global const FLOAT* input,\n"
"#ifdef USE_IMAGE\n"
" __read_only image2d_t weight,\n"
"#else\n"
" __global const uchar *weight,\n"
"#endif\n"
" __global const FLOAT *dequantScaleOffset,\n"
" __global const FLOAT *bias,\n"
" __global FLOAT* output,\n"
" __private const int bhw,\n"
" __private const int dstChannelAlign,\n"
" __private const int srcChannelAlign,\n"
" __private const int blockNum,\n"
" __private const int blockDim,\n"
" __private const float coef) {\n"
" const int x=get_global_id(0); //b/4\n"
" const int y=get_global_id(1); //c/8\n"
" UNIFORM_BOUNDRY_CHECK(x,y);\n"
" \n"
" const int out_b_idx=x << 2;\n"
" const int out_c_idx=y << 1;\n"
" COMPUTE_FLOAT8 out0=CONVERT_COMPUTE_FLOAT8(vload8(0,bias+(out_c_idx << 2)));\n"
" COMPUTE_FLOAT8 out1=out0;\n"
" COMPUTE_FLOAT8 out2=out0;\n"
" COMPUTE_FLOAT8 out3=out0;\n"
" \n"
" const int bhw4=bhw << 2;\n"
" const int input_offset=out_b_idx*4;\n"
" int out_offset=out_c_idx*bhw4+out_b_idx*4;\n"
"#ifndef USE_IMAGE\n"
" const int weight_offset=y*srcChannelAlign*4;\n"
"#endif\n"
" const int loop=(blockDim+4-1)/4;\n"
"#if INPUT_CHANNEL_LEAVES_NUM != 0\n"
" const int loop_end=max(loop-1,0);\n"
"#else\n"
" const int loop_end=loop;\n"
"#endif\n"
"#if INPUT_BATCH_LEAVES_NUM != 0\n"
" if(out_b_idx+3 >= bhw){\n"
" for (int i=0; i<blockNum; i++){\n"
" #ifdef ASYMMETRIC\n"
" COMPUTE_FLOAT8 scale,offset;\n"
" {\n"
" COMPUTE_FLOAT16 scaleOffset=CONVERT_COMPUTE_FLOAT16(convert_float16(vload16(0,dequantScaleOffset+(out_c_idx << 3)+i*dstChannelAlign*2))/coef);\n"
" scale=scaleOffset.s02468ace;\n"
" offset=scaleOffset.s13579bdf;\n"
" }\n"
" #else\n"
" COMPUTE_FLOAT8 scale=CONVERT_COMPUTE_FLOAT8(convert_float8(vload8(0,dequantScaleOffset+(out_c_idx << 2)+i*dstChannelAlign))/coef);\n"
" COMPUTE_FLOAT8 offset=0;\n"
" #endif\n"
" for (int j=0; j<loop_end; j++) {\n"
" int k=i*loop+j;\n"
" COMPUTE_FLOAT8 wei;\n"
" #ifdef USE_IMAGE\n"
" uchar16 charWeightsInt40=as_uchar16(read_imagei(weight,SAMPLER,(int2)(k,y)));\n"
" #else\n"
" uchar16 charWeightsInt40=vload16(k,weight+weight_offset);\n"
" #endif\n"
" COMPUTE_FLOAT4 in0=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+k*bhw4));\n"
" #if INPUT_BATCH_LEAVES_NUM >= 2\n"
" COMPUTE_FLOAT4 in1=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+k*bhw4+4));\n"
" #endif\n"
" #if INPUT_BATCH_LEAVES_NUM >= 3\n"
" COMPUTE_FLOAT4 in2=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+k*bhw4+8));\n"
" #endif\n"
" {\n"
" UCHAR4_TO_FLOAT8(charWeightsInt40.s0123,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)in0.s0,wei,out0);\n"
" #if INPUT_BATCH_LEAVES_NUM >= 2\n"
" out1=mad((COMPUTE_FLOAT8)in1.s0,wei,out1);\n"
" #endif\n"
" #if INPUT_BATCH_LEAVES_NUM >= 3\n"
" out2=mad((COMPUTE_FLOAT8)in2.s0,wei,out2);\n"
" #endif\n"
" }\n"
" {\n"
" UCHAR4_TO_FLOAT8(charWeightsInt40.s4567,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)in0.s1,wei,out0);\n"
" #if INPUT_BATCH_LEAVES_NUM >= 2\n"
" out1=mad((COMPUTE_FLOAT8)in1.s1,wei,out1);\n"
" #endif\n"
" #if INPUT_BATCH_LEAVES_NUM >= 3\n"
" out2=mad((COMPUTE_FLOAT8)in2.s1,wei,out2);\n"
" #endif\n"
" }\n"
" {\n"
" UCHAR4_TO_FLOAT8(charWeightsInt40.s89ab,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)in0.s2,wei,out0);\n"
" #if INPUT_BATCH_LEAVES_NUM >= 2\n"
" out1=mad((COMPUTE_FLOAT8)in1.s2,wei,out1);\n"
" #endif\n"
" #if INPUT_BATCH_LEAVES_NUM >= 3\n"
" out2=mad((COMPUTE_FLOAT8)in2.s2,wei,out2);\n"
" #endif\n"
" }\n"
" {\n"
" UCHAR4_TO_FLOAT8(charWeightsInt40.scdef,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)in0.s3,wei,out0);\n"
" #if INPUT_BATCH_LEAVES_NUM >= 2\n"
" out1=mad((COMPUTE_FLOAT8)in1.s3,wei,out1);\n"
" #endif\n"
" #if INPUT_BATCH_LEAVES_NUM >= 3\n"
" out2=mad((COMPUTE_FLOAT8)in2.s3,wei,out2);\n"
" #endif\n"
" }\n"
" }\n"
" #if INPUT_CHANNEL_LEAVES_NUM != 0\n"
" {\n"
" int k=i*loop+loop_end;\n"
" COMPUTE_FLOAT8 wei;\n"
" COMPUTE_FLOAT4 in0=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+k*bhw4));\n"
" #if INPUT_BATCH_LEAVES_NUM >= 2\n"
" COMPUTE_FLOAT4 in1=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+k*bhw4+4));\n"
" #endif\n"
" #if INPUT_BATCH_LEAVES_NUM >= 3\n"
" COMPUTE_FLOAT4 in2=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+k*bhw4+8));\n"
" #endif\n"
" #ifdef USE_IMAGE\n"
" uchar16 charWeightsInt40=as_uchar16(read_imagei(weight,SAMPLER,(int2)(k,y)));\n"
" #else\n"
" uchar16 charWeightsInt40=vload16(k,weight+weight_offset);\n"
" #endif\n"
" {\n"
" UCHAR4_TO_FLOAT8(charWeightsInt40.s0123,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)in0.s0,wei,out0);\n"
" #if INPUT_BATCH_LEAVES_NUM >= 2\n"
" out1=mad((COMPUTE_FLOAT8)in1.s0,wei,out1);\n"
" #endif\n"
" #if INPUT_BATCH_LEAVES_NUM >= 3\n"
" out2=mad((COMPUTE_FLOAT8)in2.s0,wei,out2);\n"
" #endif\n"
" }\n"
" #if INPUT_CHANNEL_LEAVES_NUM >= 2\n"
" {\n"
" UCHAR4_TO_FLOAT8(charWeightsInt40.s4567,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)in0.s1,wei,out0);\n"
" #if INPUT_BATCH_LEAVES_NUM >= 2\n"
" out1=mad((COMPUTE_FLOAT8)in1.s1,wei,out1);\n"
" #endif\n"
" #if INPUT_BATCH_LEAVES_NUM >= 3\n"
" out2=mad((COMPUTE_FLOAT8)in2.s1,wei,out2);\n"
" #endif\n"
" }\n"
" #endif\n"
" #if INPUT_CHANNEL_LEAVES_NUM >= 3\n"
" {\n"
" UCHAR4_TO_FLOAT8(charWeightsInt40.s89ab,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)in0.s2,wei,out0);\n"
" #if INPUT_BATCH_LEAVES_NUM >= 2\n"
" out1=mad((COMPUTE_FLOAT8)in1.s2,wei,out1);\n"
" #endif\n"
" #if INPUT_BATCH_LEAVES_NUM >= 3\n"
" out2=mad((COMPUTE_FLOAT8)in2.s2,wei,out2);\n"
" #endif\n"
" }\n"
" #endif\n"
" }\n"
" #endif\n"
" }\n"
" } else {\n"
"#endif\n"
" for (int i=0; i<blockNum; i++){\n"
" #ifdef ASYMMETRIC\n"
" COMPUTE_FLOAT8 scale,offset;\n"
" {\n"
" COMPUTE_FLOAT16 scaleOffset=CONVERT_COMPUTE_FLOAT16(convert_float16(vload16(0,dequantScaleOffset+(out_c_idx << 3)+i*dstChannelAlign*2))/coef);\n"
" scale=scaleOffset.s02468ace;\n"
" offset=scaleOffset.s13579bdf;\n"
" }\n"
" #else\n"
" COMPUTE_FLOAT8 scale=CONVERT_COMPUTE_FLOAT8(convert_float8(vload8(0,dequantScaleOffset+(out_c_idx << 2)+i*dstChannelAlign))/coef);\n"
" COMPUTE_FLOAT8 offset=0;\n"
" #endif\n"
" for (int j=0; j<loop_end; j++) {\n"
" int k=i*loop+j;\n"
" COMPUTE_FLOAT8 wei;\n"
" COMPUTE_FLOAT16 in=CONVERT_COMPUTE_FLOAT16(vload16(0,input+input_offset+k*bhw4));\n"
" #ifdef USE_IMAGE\n"
" uchar16 charWeightsInt40=as_uchar16(read_imagei(weight,SAMPLER,(int2)(k,y)));\n"
" #else\n"
" uchar16 charWeightsInt40=vload16(k,weight+weight_offset);\n"
" #endif\n"
" {\n"
" UCHAR4_TO_FLOAT8(charWeightsInt40.s0123,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)in.s0,wei,out0);\n"
" out1=mad((COMPUTE_FLOAT8)in.s4,wei,out1);\n"
" out2=mad((COMPUTE_FLOAT8)in.s8,wei,out2);\n"
" out3=mad((COMPUTE_FLOAT8)in.sc,wei,out3);\n"
" }\n"
" {\n"
" UCHAR4_TO_FLOAT8(charWeightsInt40.s4567,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)in.s1,wei,out0);\n"
" out1=mad((COMPUTE_FLOAT8)in.s5,wei,out1);\n"
" out2=mad((COMPUTE_FLOAT8)in.s9,wei,out2);\n"
" out3=mad((COMPUTE_FLOAT8)in.sd,wei,out3);\n"
" }\n"
" {\n"
" UCHAR4_TO_FLOAT8(charWeightsInt40.s89ab,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)in.s2,wei,out0);\n"
" out1=mad((COMPUTE_FLOAT8)in.s6,wei,out1);\n"
" out2=mad((COMPUTE_FLOAT8)in.sa,wei,out2);\n"
" out3=mad((COMPUTE_FLOAT8)in.se,wei,out3);\n"
" }\n"
" {\n"
" UCHAR4_TO_FLOAT8(charWeightsInt40.scdef,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)in.s3,wei,out0);\n"
" out1=mad((COMPUTE_FLOAT8)in.s7,wei,out1);\n"
" out2=mad((COMPUTE_FLOAT8)in.sb,wei,out2);\n"
" out3=mad((COMPUTE_FLOAT8)in.sf,wei,out3);\n"
" }\n"
" }\n"
" #if INPUT_CHANNEL_LEAVES_NUM != 0\n"
" {\n"
" int k=i*loop+loop_end;\n"
" COMPUTE_FLOAT8 wei;\n"
" COMPUTE_FLOAT16 in=CONVERT_COMPUTE_FLOAT16(vload16(0,input+input_offset+k*bhw4));\n"
" #ifdef USE_IMAGE\n"
" uchar16 charWeightsInt40=as_uchar16(read_imagei(weight,SAMPLER,(int2)(k,y)));\n"
" #else\n"
" uchar16 charWeightsInt40=vload16(k,weight+weight_offset);\n"
" #endif\n"
" {\n"
" UCHAR4_TO_FLOAT8(charWeightsInt40.s0123,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)in.s0,wei,out0);\n"
" out1=mad((COMPUTE_FLOAT8)in.s4,wei,out1);\n"
" out2=mad((COMPUTE_FLOAT8)in.s8,wei,out2);\n"
" out3=mad((COMPUTE_FLOAT8)in.sc,wei,out3);\n"
" }\n"
" #if INPUT_CHANNEL_LEAVES_NUM >= 2\n"
" {\n"
" UCHAR4_TO_FLOAT8(charWeightsInt40.s4567,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)in.s1,wei,out0);\n"
" out1=mad((COMPUTE_FLOAT8)in.s5,wei,out1);\n"
" out2=mad((COMPUTE_FLOAT8)in.s9,wei,out2);\n"
" out3=mad((COMPUTE_FLOAT8)in.sd,wei,out3);\n"
" }\n"
" #endif\n"
" #if INPUT_CHANNEL_LEAVES_NUM >= 3\n"
" {\n"
" UCHAR4_TO_FLOAT8(charWeightsInt40.s89ab,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)in.s2,wei,out0);\n"
" out1=mad((COMPUTE_FLOAT8)in.s6,wei,out1);\n"
" out2=mad((COMPUTE_FLOAT8)in.sa,wei,out2);\n"
" out3=mad((COMPUTE_FLOAT8)in.se,wei,out3);\n"
" }\n"
" #endif\n"
" }\n"
" #endif\n"
" }\n"
"#if INPUT_BATCH_LEAVES_NUM != 0\n"
" }\n"
"#endif\n"
" \n"
"#ifdef RELU\n"
" out0=fmax(out0,(COMPUTE_FLOAT8)0);\n"
" out1=fmax(out1,(COMPUTE_FLOAT8)0);\n"
" out2=fmax(out2,(COMPUTE_FLOAT8)0);\n"
" out3=fmax(out3,(COMPUTE_FLOAT8)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" out0=clamp(out0,(COMPUTE_FLOAT8)0,(COMPUTE_FLOAT8)6);\n"
" out1=clamp(out1,(COMPUTE_FLOAT8)0,(COMPUTE_FLOAT8)6);\n"
" out2=clamp(out2,(COMPUTE_FLOAT8)0,(COMPUTE_FLOAT8)6);\n"
" out3=clamp(out3,(COMPUTE_FLOAT8)0,(COMPUTE_FLOAT8)6);\n"
"#endif\n"
"#if INPUT_BATCH_LEAVES_NUM != 0\n"
" if(out_b_idx+3 >= bhw){\n"
" #if INPUT_BATCH_LEAVES_NUM == 3\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0.s0123,out1.s0123)),0,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out2.s0123),0,output+out_offset+8);\n"
" if((out_c_idx << 2)+4<dstChannelAlign){\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0.s4567,out1.s4567)),0,output+out_offset+bhw4);\n"
" vstore4(CONVERT_FLOAT4(out2.s4567),0,output+out_offset+bhw4+8);\n"
" }\n"
" #elif INPUT_BATCH_LEAVES_NUM == 2\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0.s0123,out1.s0123)),0,output+out_offset);\n"
" if((out_c_idx << 2)+4<dstChannelAlign){\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0.s4567,out1.s4567)),0,output+out_offset+bhw4);\n"
" }\n"
" #elif INPUT_BATCH_LEAVES_NUM == 1\n"
" vstore4(CONVERT_FLOAT4(out0.s0123),0,output+out_offset);\n"
" if((out_c_idx << 2)+4<dstChannelAlign){\n"
" vstore4(CONVERT_FLOAT4(out0.s4567),0,output+out_offset+bhw4);\n"
" }\n"
" #endif\n"
" }else{\n"
"#endif\n"
" vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0.s0123,out1.s0123,out2.s0123,out3.s0123)),0,output+out_offset);\n"
" if((out_c_idx << 2)+4<dstChannelAlign){\n"
" vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0.s4567,out1.s4567,out2.s4567,out3.s4567)),0,output+out_offset+bhw4);\n"
" }\n"
"#if INPUT_BATCH_LEAVES_NUM != 0\n"
" }\n"
"#endif\n"
"}\n"
"__kernel void gemm_b4_c8_int8_buf(GLOBAL_SIZE_DIM2\n"
" __global const FLOAT* input,\n"
"#ifdef USE_IMAGE\n"
" __read_only image2d_t weight,\n"
"#else\n"
" __global const char *weight,\n"
"#endif\n"
" __global const FLOAT *dequantScaleOffset,\n"
" __global const FLOAT *bias,\n"
" __global FLOAT* output,\n"
" __private const int bhw,\n"
" __private const int dstChannelAlign,\n"
" __private const int srcChannelAlign,\n"
" __private const int blockNum,\n"
" __private const int blockDim,\n"
" __private const float coef) {\n"
" const int x=get_global_id(0); //b/4\n"
" const int y=get_global_id(1); //c/8\n"
" UNIFORM_BOUNDRY_CHECK(x,y);\n"
" \n"
" const int out_b_idx=x << 2;\n"
" const int out_c_idx=y << 1;\n"
" COMPUTE_FLOAT8 out0=CONVERT_COMPUTE_FLOAT8(vload8(0,bias+(out_c_idx << 2)));\n"
" COMPUTE_FLOAT8 out1=out0;\n"
" COMPUTE_FLOAT8 out2=out0;\n"
" COMPUTE_FLOAT8 out3=out0;\n"
" \n"
" const int bhw4=bhw << 2;\n"
" const int input_offset=out_b_idx*4;\n"
" int out_offset=out_c_idx*bhw4+out_b_idx*4;\n"
"#ifndef USE_IMAGE\n"
" const int weight_offset=y*srcChannelAlign*8;\n"
"#endif\n"
" const int loop=(blockDim+4-1)/4;\n"
"#if INPUT_CHANNEL_LEAVES_NUM != 0\n"
" const int loop_end=max(loop-1,0);\n"
"#else\n"
" const int loop_end=loop;\n"
"#endif\n"
"#if INPUT_BATCH_LEAVES_NUM != 0\n"
" if(out_b_idx+3 >= bhw){\n"
" for (int i=0; i<blockNum; i++){\n"
" COMPUTE_FLOAT16 scale,offset;\n"
" {\n"
" #ifdef ASYMMETRIC\n"
" COMPUTE_FLOAT16 scaleOffset=CONVERT_COMPUTE_FLOAT16(convert_float16(vload16(0,dequantScaleOffset+(out_c_idx << 3)+i*dstChannelAlign*2))/coef);\n"
" scale=(COMPUTE_FLOAT16)(scaleOffset.s02468ace,scaleOffset.s02468ace);\n"
" offset=(COMPUTE_FLOAT16)(scaleOffset.s13579bdf,scaleOffset.s13579bdf);\n"
" #else\n"
" scale.s01234567=CONVERT_COMPUTE_FLOAT8(convert_float8(vload8(0,dequantScaleOffset+(out_c_idx << 2)+i*dstChannelAlign))/coef);\n"
" scale.s89abcdef=scale.s01234567;\n"
" offset=0;\n"
" #endif\n"
" }\n"
" for (int j=0; j<loop_end; j++) {\n"
" int k=i*loop+j;\n"
" int k2=k << 1;\n"
" #ifdef USE_IMAGE\n"
" COMPUTE_FLOAT16 wei0=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(k2,y))))*scale+offset;\n"
" COMPUTE_FLOAT16 wei1=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(k2+1,y))))*scale+offset;\n"
" #else\n"
" COMPUTE_FLOAT16 wei0=CONVERT_COMPUTE_FLOAT16(vload16(k2,weight+weight_offset))*scale+offset;\n"
" COMPUTE_FLOAT16 wei1=CONVERT_COMPUTE_FLOAT16(vload16(k2+1,weight+weight_offset))*scale+offset;\n"
" #endif\n"
" COMPUTE_FLOAT4 in0=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+k*bhw4));\n"
" #if INPUT_BATCH_LEAVES_NUM >= 2\n"
" COMPUTE_FLOAT4 in1=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+k*bhw4+4));\n"
" #endif\n"
" #if INPUT_BATCH_LEAVES_NUM >= 3\n"
" COMPUTE_FLOAT4 in2=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+k*bhw4+8));\n"
" #endif\n"
" {\n"
" out0=mad((COMPUTE_FLOAT8)in0.s0,wei0.s01234567,out0);\n"
" #if INPUT_BATCH_LEAVES_NUM >= 2\n"
" out1=mad((COMPUTE_FLOAT8)in1.s0,wei0.s01234567,out1);\n"
" #endif\n"
" #if INPUT_BATCH_LEAVES_NUM >= 3\n"
" out2=mad((COMPUTE_FLOAT8)in2.s0,wei0.s01234567,out2);\n"
" #endif\n"
" }\n"
" {\n"
" out0=mad((COMPUTE_FLOAT8)in0.s1,wei0.s89abcdef,out0);\n"
" #if INPUT_BATCH_LEAVES_NUM >= 2\n"
" out1=mad((COMPUTE_FLOAT8)in1.s1,wei0.s89abcdef,out1);\n"
" #endif\n"
" #if INPUT_BATCH_LEAVES_NUM >= 3\n"
" out2=mad((COMPUTE_FLOAT8)in2.s1,wei0.s89abcdef,out2);\n"
" #endif\n"
" }\n"
" {\n"
" out0=mad((COMPUTE_FLOAT8)in0.s2,wei1.s01234567,out0);\n"
" #if INPUT_BATCH_LEAVES_NUM >= 2\n"
" out1=mad((COMPUTE_FLOAT8)in1.s2,wei1.s01234567,out1);\n"
" #endif\n"
" #if INPUT_BATCH_LEAVES_NUM >= 3\n"
" out2=mad((COMPUTE_FLOAT8)in2.s2,wei1.s01234567,out2);\n"
" #endif\n"
" }\n"
" {\n"
" out0=mad((COMPUTE_FLOAT8)in0.s3,wei1.s89abcdef,out0);\n"
" #if INPUT_BATCH_LEAVES_NUM >= 2\n"
" out1=mad((COMPUTE_FLOAT8)in1.s3,wei1.s89abcdef,out1);\n"
" #endif\n"
" #if INPUT_BATCH_LEAVES_NUM >= 3\n"
" out2=mad((COMPUTE_FLOAT8)in2.s3,wei1.s89abcdef,out2);\n"
" #endif\n"
" }\n"
" }\n"
" #if INPUT_CHANNEL_LEAVES_NUM != 0\n"
" {\n"
" int k=i*loop+loop_end;\n"
" int k2=k << 1;\n"
" COMPUTE_FLOAT4 in0=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+k*bhw4));\n"
" #if INPUT_BATCH_LEAVES_NUM >= 2\n"
" COMPUTE_FLOAT4 in1=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+k*bhw4+4));\n"
" #endif\n"
" #if INPUT_BATCH_LEAVES_NUM >= 3\n"
" COMPUTE_FLOAT4 in2=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+k*bhw4+8));\n"
" #endif\n"
" #ifdef USE_IMAGE\n"
" COMPUTE_FLOAT16 wei0=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(k2,y))))*scale+offset;\n"
" COMPUTE_FLOAT16 wei1=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(k2+1,y))))*scale+offset;\n"
" #else\n"
" COMPUTE_FLOAT16 wei0=CONVERT_COMPUTE_FLOAT16(vload16(k2,weight+weight_offset))*scale+offset;\n"
" COMPUTE_FLOAT16 wei1=CONVERT_COMPUTE_FLOAT16(vload16(k2+1,weight+weight_offset))*scale+offset;\n"
" #endif\n"
" {\n"
" out0=mad((COMPUTE_FLOAT8)in0.s0,wei0.s01234567,out0);\n"
" #if INPUT_BATCH_LEAVES_NUM >= 2\n"
" out1=mad((COMPUTE_FLOAT8)in1.s0,wei0.s01234567,out1);\n"
" #endif\n"
" #if INPUT_BATCH_LEAVES_NUM >= 3\n"
" out2=mad((COMPUTE_FLOAT8)in2.s0,wei0.s01234567,out2);\n"
" #endif\n"
" }\n"
" #if INPUT_CHANNEL_LEAVES_NUM >= 2\n"
" {\n"
" out0=mad((COMPUTE_FLOAT8)in0.s1,wei0.s89abcdef,out0);\n"
" #if INPUT_BATCH_LEAVES_NUM >= 2\n"
" out1=mad((COMPUTE_FLOAT8)in1.s1,wei0.s89abcdef,out1);\n"
" #endif\n"
" #if INPUT_BATCH_LEAVES_NUM >= 3\n"
" out2=mad((COMPUTE_FLOAT8)in2.s1,wei0.s89abcdef,out2);\n"
" #endif\n"
" }\n"
" #endif\n"
" #if INPUT_CHANNEL_LEAVES_NUM >= 3\n"
" {\n"
" out0=mad((COMPUTE_FLOAT8)in0.s2,wei1.s01234567,out0);\n"
" #if INPUT_BATCH_LEAVES_NUM >= 2\n"
" out1=mad((COMPUTE_FLOAT8)in1.s2,wei1.s01234567,out1);\n"
" #endif\n"
" #if INPUT_BATCH_LEAVES_NUM >= 3\n"
" out2=mad((COMPUTE_FLOAT8)in2.s2,wei1.s01234567,out2);\n"
" #endif\n"
" }\n"
" #endif\n"
" }\n"
" #endif\n"
" }\n"
" } else {\n"
"#endif\n"
" for (int i=0; i<blockNum; i++){\n"
" COMPUTE_FLOAT16 scale,offset;\n"
" {\n"
" #ifdef ASYMMETRIC\n"
" COMPUTE_FLOAT16 scaleOffset=CONVERT_COMPUTE_FLOAT16(convert_float16(vload16(0,dequantScaleOffset+(out_c_idx << 3)+i*dstChannelAlign*2))/coef);\n"
" scale=(COMPUTE_FLOAT16)(scaleOffset.s02468ace,scaleOffset.s02468ace);\n"
" offset=(COMPUTE_FLOAT16)(scaleOffset.s13579bdf,scaleOffset.s13579bdf);\n"
" #else\n"
" scale.s01234567=CONVERT_COMPUTE_FLOAT8(convert_float8(vload8(0,dequantScaleOffset+(out_c_idx << 2)+i*dstChannelAlign))/coef);\n"
" scale.s89abcdef=scale.s01234567;\n"
" offset=0;\n"
" #endif\n"
" }\n"
" for (int j=0; j<loop_end; j++) {\n"
" int k=i*loop+j;\n"
" int k2=k << 1;\n"
" COMPUTE_FLOAT16 in=CONVERT_COMPUTE_FLOAT16(vload16(0,input+input_offset+k*bhw4));\n"
" #ifdef USE_IMAGE\n"
" COMPUTE_FLOAT16 wei0=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(k2,y))))*scale+offset;\n"
" COMPUTE_FLOAT16 wei1=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(k2+1,y))))*scale+offset;\n"
" #else\n"
" COMPUTE_FLOAT16 wei0=CONVERT_COMPUTE_FLOAT16(vload16(k2,weight+weight_offset))*scale+offset;\n"
" COMPUTE_FLOAT16 wei1=CONVERT_COMPUTE_FLOAT16(vload16(k2+1,weight+weight_offset))*scale+offset;\n"
" #endif\n"
" {\n"
" out0=mad((COMPUTE_FLOAT8)in.s0,wei0.s01234567,out0);\n"
" out1=mad((COMPUTE_FLOAT8)in.s4,wei0.s01234567,out1);\n"
" out2=mad((COMPUTE_FLOAT8)in.s8,wei0.s01234567,out2);\n"
" out3=mad((COMPUTE_FLOAT8)in.sc,wei0.s01234567,out3);\n"
" }\n"
" {\n"
" out0=mad((COMPUTE_FLOAT8)in.s1,wei0.s89abcdef,out0);\n"
" out1=mad((COMPUTE_FLOAT8)in.s5,wei0.s89abcdef,out1);\n"
" out2=mad((COMPUTE_FLOAT8)in.s9,wei0.s89abcdef,out2);\n"
" out3=mad((COMPUTE_FLOAT8)in.sd,wei0.s89abcdef,out3);\n"
" }\n"
" {\n"
" out0=mad((COMPUTE_FLOAT8)in.s2,wei1.s01234567,out0);\n"
" out1=mad((COMPUTE_FLOAT8)in.s6,wei1.s01234567,out1);\n"
" out2=mad((COMPUTE_FLOAT8)in.sa,wei1.s01234567,out2);\n"
" out3=mad((COMPUTE_FLOAT8)in.se,wei1.s01234567,out3);\n"
" }\n"
" {\n"
" out0=mad((COMPUTE_FLOAT8)in.s3,wei1.s89abcdef,out0);\n"
" out1=mad((COMPUTE_FLOAT8)in.s7,wei1.s89abcdef,out1);\n"
" out2=mad((COMPUTE_FLOAT8)in.sb,wei1.s89abcdef,out2);\n"
" out3=mad((COMPUTE_FLOAT8)in.sf,wei1.s89abcdef,out3);\n"
" }\n"
" }\n"
" #if INPUT_CHANNEL_LEAVES_NUM != 0\n"
" {\n"
" int k=i*loop+loop_end;\n"
" int k2=k << 1;\n"
" COMPUTE_FLOAT16 in=CONVERT_COMPUTE_FLOAT16(vload16(0,input+input_offset+k*bhw4));\n"
" #ifdef USE_IMAGE\n"
" COMPUTE_FLOAT16 wei0=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(k2,y))))*scale+offset;\n"
" COMPUTE_FLOAT16 wei1=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(k2+1,y))))*scale+offset;\n"
" #else\n"
" COMPUTE_FLOAT16 wei0=CONVERT_COMPUTE_FLOAT16(vload16(k2,weight+weight_offset))*scale+offset;\n"
" COMPUTE_FLOAT16 wei1=CONVERT_COMPUTE_FLOAT16(vload16(k2+1,weight+weight_offset))*scale+offset;\n"
" #endif\n"
" {\n"
" out0=mad((COMPUTE_FLOAT8)in.s0,wei0.s01234567,out0);\n"
" out1=mad((COMPUTE_FLOAT8)in.s4,wei0.s01234567,out1);\n"
" out2=mad((COMPUTE_FLOAT8)in.s8,wei0.s01234567,out2);\n"
" out3=mad((COMPUTE_FLOAT8)in.sc,wei0.s01234567,out3);\n"
" }\n"
" #if INPUT_CHANNEL_LEAVES_NUM >= 2\n"
" {\n"
" out0=mad((COMPUTE_FLOAT8)in.s1,wei0.s89abcdef,out0);\n"
" out1=mad((COMPUTE_FLOAT8)in.s5,wei0.s89abcdef,out1);\n"
" out2=mad((COMPUTE_FLOAT8)in.s9,wei0.s89abcdef,out2);\n"
" out3=mad((COMPUTE_FLOAT8)in.sd,wei0.s89abcdef,out3);\n"
" }\n"
" #endif\n"
" #if INPUT_CHANNEL_LEAVES_NUM >= 3\n"
" {\n"
" out0=mad((COMPUTE_FLOAT8)in.s2,wei1.s01234567,out0);\n"
" out1=mad((COMPUTE_FLOAT8)in.s6,wei1.s01234567,out1);\n"
" out2=mad((COMPUTE_FLOAT8)in.sa,wei1.s01234567,out2);\n"
" out3=mad((COMPUTE_FLOAT8)in.se,wei1.s01234567,out3);\n"
" }\n"
" #endif\n"
" }\n"
" #endif\n"
" }\n"
"#if INPUT_BATCH_LEAVES_NUM != 0\n"
" }\n"
"#endif\n"
" \n"
"#ifdef RELU\n"
" out0=fmax(out0,(COMPUTE_FLOAT8)0);\n"
" out1=fmax(out1,(COMPUTE_FLOAT8)0);\n"
" out2=fmax(out2,(COMPUTE_FLOAT8)0);\n"
" out3=fmax(out3,(COMPUTE_FLOAT8)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" out0=clamp(out0,(COMPUTE_FLOAT8)0,(COMPUTE_FLOAT8)6);\n"
" out1=clamp(out1,(COMPUTE_FLOAT8)0,(COMPUTE_FLOAT8)6);\n"
" out2=clamp(out2,(COMPUTE_FLOAT8)0,(COMPUTE_FLOAT8)6);\n"
" out3=clamp(out3,(COMPUTE_FLOAT8)0,(COMPUTE_FLOAT8)6);\n"
"#endif\n"
"#if INPUT_BATCH_LEAVES_NUM != 0\n"
" if(out_b_idx+3 >= bhw){\n"
" #if INPUT_BATCH_LEAVES_NUM == 3\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0.s0123,out1.s0123)),0,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out2.s0123),0,output+out_offset+8);\n"
" if((out_c_idx << 2)+4<dstChannelAlign){\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0.s4567,out1.s4567)),0,output+out_offset+bhw4);\n"
" vstore4(CONVERT_FLOAT4(out2.s4567),0,output+out_offset+bhw4+8);\n"
" }\n"
" #elif INPUT_BATCH_LEAVES_NUM == 2\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0.s0123,out1.s0123)),0,output+out_offset);\n"
" if((out_c_idx << 2)+4<dstChannelAlign){\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0.s4567,out1.s4567)),0,output+out_offset+bhw4);\n"
" }\n"
" #elif INPUT_BATCH_LEAVES_NUM == 1\n"
" vstore4(CONVERT_FLOAT4(out0.s0123),0,output+out_offset);\n"
" if((out_c_idx << 2)+4<dstChannelAlign){\n"
" vstore4(CONVERT_FLOAT4(out0.s4567),0,output+out_offset+bhw4);\n"
" }\n"
" #endif\n"
" }else{\n"
"#endif\n"
" vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0.s0123,out1.s0123,out2.s0123,out3.s0123)),0,output+out_offset);\n"
" if((out_c_idx << 2)+4<dstChannelAlign){\n"
" vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0.s4567,out1.s4567,out2.s4567,out3.s4567)),0,output+out_offset+bhw4);\n"
" }\n"
"#if INPUT_BATCH_LEAVES_NUM != 0\n"
" }\n"
"#endif\n"
"}\n"
;
#endif
}
